<?php

/*
 * Copyright (C) 2009 - 2011 Pham Cong Dinh
 *
 * This file is part of Spica.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 3 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/

/**
 * The <code>SpicaHtmlUtils</code> class provides static helper methods for processing HTML strings.
 *
 * namespace spica\core\filter\HtmlUtils
 *
 * @category   spica
 * @package    core
 * @subpackage filter
 * @author     Pham Cong Dinh <pcdinh at phpvietnam dot net>
 * @since      Version 0.3
 * @since      January 30, 2010
 * @copyright  Pham Cong Dinh (http://www.phpvietnam.net)
 * @license    http://www.gnu.org/licenses/lgpl-3.0.txt
 * @version    $Id: HtmlUtils.php 1869 2011-01-07 18:55:25Z pcdinh $
 */
class SpicaHtmlUtils
{
    /**
     * Fetches a remote page and extracts every <a> element that carries a
     * double-quoted href attribute.
     *
     * Only URLs starting with http:// or https:// (case-insensitive) are
     * fetched; any other input yields an empty array without touching the
     * network or the file system.
     *
     * @param  string $url Absolute HTTP(S) URL of the page to scan
     * @return array  Empty array when $url is not an HTTP(S) URL. Otherwise the
     *                preg_match_all() result: index 0 holds the full anchor
     *                tags, index 1 the href values, index 2 the link texts.
     *                All three lists are empty when the page cannot be fetched.
     */
    public static function getLinks($url)
    {
        if (1 !== preg_match('#^https?://#i', $url))
        {
            return array();
        }

        $content = file_get_contents($url);

        if (false === $content)
        {
            // PHP has already raised a warning for the failed fetch; treat the
            // page as empty so that the return shape stays the same.
            $content = '';
        }

        $stripped = strip_tags(trim($content), '<a>');
        $matches  = array();

        // The link text is matched lazily so that one match can never span
        // several anchors, and so that text containing a double quote is kept.
        preg_match_all('/<a(?:[^>]*)href="([^"]*)"(?:[^>]*)>(.*?)<\/a>/is', $stripped, $matches);

        return $matches;
    }

    /**
     * Makes HTML input (possibly coming from a textarea) safe to print on a web
     * page: every special character is converted to its HTML entity and every
     * non-empty line is wrapped in its own <p> element.
     *
     * @param  string $html Raw user text, expected to be UTF-8
     * @return string Escaped markup made of <p>...</p> blocks; empty string
     *                when the input contains no text line
     */
    public static function normalizeHtml($html)
    {
        // Map CR and CRLF to LF, then collapse runs of line breaks into one.
        $html = str_replace("\r", "\n", $html);
        $html = preg_replace("@\n\n+@", "\n", $html);

        $html = htmlentities($html, ENT_QUOTES, 'UTF-8');

        $paragraphs = '';

        foreach (explode("\n", $html) as $line)
        {
            // After collapsing, an empty line can only come from a leading or
            // trailing line break (or from empty input): emit no <p></p> for it.
            if ('' === $line)
            {
                continue;
            }

            $paragraphs .= "<p>{$line}</p>";
        }

        return $paragraphs;
    }

    /**
     * Converts some HTML entities back to real HTML tags.
     *
     * With $all set to true every HTML entity is decoded and backslashes are
     * removed with stripslashes(). Otherwise only a small whitelist of escaped
     * tags is restored: <em>, <blockquote> (when it spans a whole line) and
     * <a href="..." [title="..."]>.
     *
     * @param  string $text Entity-escaped text
     * @param  bool   $all  true (strictly) to decode every entity
     * @return string
     */
    public static function unnormalizeHtml($text, $all = false)
    {
        if (true === $all)
        {
            // ENT_QUOTES also reverses &#039;, which normalizeHtml() produces.
            return stripslashes(html_entity_decode($text, ENT_QUOTES, 'UTF-8'));
        }

        $text = preg_replace('!&lt;em&gt;(.*?)&lt;/em&gt;!m', '<em>$1</em>', $text);
        $text = preg_replace('!^&lt;blockquote&gt;(?:&lt;p&gt;)?(.*?)(?:&lt;\/p&gt;)?&lt;\/blockquote&gt;$!m', '<blockquote><p>$1</p></blockquote>', $text);

        // Links go through a callback so that the href can be vetted before
        // the anchor is turned back into live markup.
        return preg_replace_callback(
            '!&lt;a +href=&quot;(.*?)&quot;(?: +title=&quot;(.*?)&quot;)? *&gt;(.*?)&lt;/a&gt;!m',
            array(__CLASS__, 'restoreLink'),
            $text
        );
    }

    /**
     * preg_replace_callback() handler of unnormalizeHtml(): rebuilds one
     * escaped anchor, or leaves it escaped when its href is not acceptable.
     *
     * @param  array $match 0: whole escaped anchor, 1: href, 2: title (may be
     *                      empty or absent), 3: link text
     * @return string
     */
    private static function restoreLink(array $match)
    {
        $href = $match[1];

        // Absolute URLs must use http, https, ftp or ftps.
        $allowed = (1 === preg_match('!^(?:ht|f)tps?://!i', $href));

        if (false === $allowed)
        {
            // A colon before the first "/", "?" or "#" denotes a URL scheme
            // (javascript:, data:, ...). Anything without one is a relative
            // reference and is accepted.
            $allowed = (1 !== preg_match('!^[^/?#]*:!', $href));
        }

        if (false === $allowed)
        {
            return $match[0];
        }

        $title = isset($match[2]) ? $match[2] : '';

        return '<a href="' . $href . '" title="' . $title . '">' . $match[3] . '</a>';
    }

    /**
     * Wraps every http://, https:// or ftp:// URL found in a string in an <a>
     * element that carries rel="nofollow".
     *
     * The string is scanned as plain text: a URL that already sits inside an
     * href attribute is wrapped as well.
     *
     * @param  string $string
     * @return string
     */
    public static function nofollow($string)
    {
        $pattern = "/((http|https|ftp):\/\/[a-z0-9;\/\?:@=\&\$\-_\.\+!*'\(\),~%#]+)/i";

        return preg_replace($pattern, '<a href="$1" rel="nofollow">$1</a>', $string);
    }

    /**
     * Converts a UTF-8 string with mbstring's "HTML-ENTITIES" target encoding,
     * so that characters outside the ASCII range are stored as HTML entities.
     * See here for the full code chart: http://www.ascii.cl/htmlcodes.htm
     *
     * NOTE(review): the "HTML-ENTITIES" mbstring encoding is deprecated as of
     * PHP 8.2. The call is left untouched because the documented replacements
     * (htmlentities(), mb_encode_numericentity()) do not produce identical
     * output - confirm what stored data expects before switching.
     *
     * @param  string $text UTF-8 encoded text
     * @return string
     */
    public static function escape($text)
    {
        return mb_convert_encoding($text, 'HTML-ENTITIES', "UTF-8");
    }

    /**
     * Replaces quotes with equivalent HTML entities to make it possible to put
     * them into attribute values. Backslash-escaped quotes (\' and \") are
     * replaced together with their backslash.
     *
     * @param  string $str
     * @return string
     */
    public static function escapeQuotes($str)
    {
        // The escaped forms come first: str_replace() applies the pairs in
        // order, so the backslash is consumed before the bare quote is handled.
        return str_replace(
            array("\\'", '\\"', "'", '"'),
            array('&#39;', '&quot;', '&#39;', '&quot;'),
            $str
        );
    }

    /**
     * Replaces HTML entities for quotes with real quotes (" and ').
     *
     * Both spellings of the apostrophe entity are recognised: &#39; (written by
     * escapeQuotes()) and &#039; (written by htmlentities() with ENT_QUOTES).
     *
     * @param  string $str
     * @return string
     */
    public static function unescapeQuotes($str)
    {
        return str_replace(array('&#39;', '&#039;', '&quot;'), array("'", "'", '"'), $str);
    }

    /**
     * Replaces WordPress curly ("smart") double quotes with the plain text
     * double quote (").
     *
     * UTF-8 input has U+201C and U+201D replaced. Input that is not valid
     * UTF-8 is assumed to be Windows-1252, where the same quotes are the single
     * bytes 0x93 and 0x94.
     *
     * @param  string $str
     * @return string
     */
    public static function normalizeWpQuotes($str)
    {
        // Byte escapes keep this source file independent of its own encoding.
        $str = str_replace(array("\xE2\x80\x9C", "\xE2\x80\x9D"), '"', $str);

        // 0x93 and 0x94 are also legal continuation bytes in UTF-8 (the en dash
        // is E2 80 93), so they are only replaced when the string is not valid
        // UTF-8. The empty /u pattern matches exactly when the subject is.
        if (1 !== preg_match('//u', $str))
        {
            $str = str_replace(array("\x93", "\x94"), '"', $str);
        }

        return $str;
    }
}

?>
